# Clean up APNs for older CA fires

pacman::p_load(stringdist, data.table, dplyr, fst)

## Preliminaries
rm(list = ls())

source("code/globals.R")

damage <- readRDS(file.path(WORKING, "damage_CA_older.Rds")) %>%
  data.table()

dam07 <- damage %>% dplyr::filter(year(date) == 2007)
dam08 <- damage %>% dplyr::filter(year(date) == 2008)
dam09 <- damage %>% dplyr::filter(year(date) == 2009)
# dam11<-damage %>% dplyr::filter(year(date)==2011) This would be Canyon Fire, but none of those records have APN numbers; would need to merge on geographic coords.
dam12 <- damage %>% dplyr::filter(year(date) == 2012)

attr07 <- read_fst(file.path(WORKING, "ztraxdata", "CAattr2007.fst"), as.data.table = TRUE)
attr08 <- read_fst(file.path(WORKING, "ztraxdata", "CAattr2008.fst"), as.data.table = TRUE)
attr09 <- read_fst(file.path(WORKING, "ztraxdata", "CAattr2009.fst"), as.data.table = TRUE)
# attr11<-read_fst(file.path(WORKING,'ztraxdata',"CAattr2011.fst"),as.data.table=TRUE)
attr12 <- read_fst(file.path(WORKING, "ztraxdata", "CAattr2012.fst"), as.data.table = TRUE)

## Confirm unique IDs in all datasets
dlist <- list(dam07, dam08, dam09, dam12)
alist <- list(attr07, attr08, attr09, attr12)
for (y in 1:length(dlist)) {
  print(y)
  z <- dlist[[y]]
  stopifnot(anyDuplicated(z, by = c("FIPS", "UnformattedAssessorParcelNumber")) == 0)
  zz <- alist[[y]]
  stopifnot(anyDuplicated(zz, by = c("ImportParcelID", "BuildingOrImprovementNumber")) == 0)
  rm(z, zz)
}
rm(dlist, alist, y)

## Function to inspect IDs by county in each dataset. Looks for approx string matches on address; then use that to compare APNs
inspectAPN <- function(targetfips, damagefile, attrfile) {
  dd <- damagefile[FIPS == targetfips, c("UnformattedAssessorParcelNumber", "Address", "NotesField")]
  dd$Address <- tolower(dd$Address)
  aa <- attrfile[FIPS == targetfips, c(
    "BuildingOrImprovementNumber", "UnformattedAssessorParcelNumber", "PropertyFullStreetAddress",
    "NoOfUnits", "PropertyLandUseStndCode"
  )]
  aa$address <- tolower(aa$PropertyFullStreetAddress)
  names(dd)[names(dd) == "UnformattedAssessorParcelNumber"] <- "UnformattedAPN_DINS"
  names(dd)[names(dd) == "Address"] <- "address_DINS"
  ## For 100 random fire records, find closest address string in ZTrax
  set.seed(16473)
  dd$rand <- runif(nrow(dd))
  dd <- dd[order(dd$rand), ]
  if (nrow(dd) > 100) {
    dd <- dd[1:100, ]
  }
  table <- data.frame(matrix(nrow = nrow(dd), ncol = 5, data = NA))
  ## Going house by house in the DINS sample, subset Ztrax to only records with identical house NUMBER, then find closest string for full address.
  # for (i in seq(1,nrow(dd))) {
  #   exacthousenumbermatches<-aa[aa$PropertyHouseNumber == dd[i,STREETNUMBER]]
  #   bestmatch<-amatch(dd[i,address_DINS],exacthousenumbermatches$address,maxDist=100,method='dl')
  #   tablerow<-cbind(dd[i,c('UnformattedAPN_DINS','APN_DINS','address_DINS','STRUCTURETYPE','howmanystructures','badrecordflag')],
  #                   exacthousenumbermatches[bestmatch,c('PropertyFullStreetAddress','UnformattedAssessorParcelNumber','AssessorParcelNumber')])
  #   tablerow$stringdist<-stringdist(tolower(tablerow$address_DINS),tolower(tablerow$PropertyFullStreetAddress),method='dl')
  #   table[i,]<-tablerow
  # }
  ## DO THIS WITHOUT EXACT HOUSE NUMBER MATCHES TO AVOID HAVING TO EXTRACT HOUSE NUMBERS FOR COUNTIES WHERE NOT PROVIDED SEPARATELY
  for (i in seq(1, nrow(dd))) {
    bestmatch <- amatch(dd[i, address_DINS], aa$address, maxDist = 100, method = "dl")
    tablerow <- cbind(
      dd[i, c("UnformattedAPN_DINS", "address_DINS")],
      aa[bestmatch, c("PropertyFullStreetAddress", "UnformattedAssessorParcelNumber")]
    )
    tablerow$stringdist <- stringdist(tolower(tablerow$address_DINS), tolower(tablerow$PropertyFullStreetAddress), method = "dl")
    table[i, ] <- tablerow
  }

  names(table) <- names(tablerow)
  table <- table[, c(
    "stringdist", "UnformattedAssessorParcelNumber", "UnformattedAPN_DINS",
    "PropertyFullStreetAddress",
    "address_DINS"
  )]
  table <- table[order(table$stringdist), ]
  table$lengthUAPN_DINS <- nchar(table$UnformattedAPN_DINS)
  table$lengthUAPN_Ztrax <- nchar(table$UnformattedAssessorParcelNumber)
  return(table)
}

## Function to check how many string address matches are perfectly match on UAPN
checkAPN <- function(targetfips, damagefile, attrfile) {
  v <- inspectAPN(targetfips, damagefile, attrfile)
  v <- v[v$stringdist <= 10 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
  v <- v[v$stringdist <= 8 & !is.na(v$stringdist), ]
  print(nrow(v[v$UnformattedAssessorParcelNumber == v$UnformattedAPN_DINS, ]) / nrow(v))
}

############ 2007 incidents

dam07[, .N, by = FIPS]

# 06071
inspectAPN(6071, dam07, attr07)
dam07 <- dam07 %>%
  mutate(UnformattedAssessorParcelNumber = if_else(FIPS == 6071,
    paste0("0", substr(UnformattedAssessorParcelNumber, 1, 8)),
    UnformattedAssessorParcelNumber
  ))
checkAPN(6071, dam07, attr07)

############ 2008 incidents

dam08[, .N, by = FIPS]

# 6087
inspectAPN(6087, dam08, attr08)
dam08 <- dam08 %>%
  mutate(UnformattedAssessorParcelNumber = if_else(FIPS == 6087,
    gsub("-", "", UnformattedAssessorParcelNumber, fixed = TRUE),
    UnformattedAssessorParcelNumber
  ))
checkAPN(6087, dam08, attr08)

# 6007
inspectAPN(6007, dam08, attr08)
dam08 <- dam08 %>%
  mutate(UnformattedAssessorParcelNumber = case_when( ## 2 variations: WITH dashes and already including leading zero; or WITHOUT dashes and missing leading zero.
    FIPS == 6007 & grepl("-", UnformattedAssessorParcelNumber) ~ gsub("-", "", UnformattedAssessorParcelNumber, fixed = TRUE),
    FIPS == 6007 & !grepl("-", UnformattedAssessorParcelNumber) ~ paste0("0", UnformattedAssessorParcelNumber, fixed = TRUE),
    FIPS != 6007 ~ UnformattedAssessorParcelNumber
  ))
checkAPN(6007, dam08, attr08)

############### 2009 incidents

dam09[, .N, by = FIPS]

# 6061
inspectAPN(6061, dam09, attr09)
dam09 <- dam09 %>%
  mutate(UnformattedAssessorParcelNumber = if_else(FIPS == 6061,
    gsub("-", "", UnformattedAssessorParcelNumber, fixed = TRUE),
    UnformattedAssessorParcelNumber
  ))
checkAPN(6061, dam09, attr09)

# 6087
inspectAPN(6087, dam09, attr09)
dam09 <- dam09 %>%
  mutate(UnformattedAssessorParcelNumber = if_else(FIPS == 6087,
    gsub("-", "", UnformattedAssessorParcelNumber, fixed = TRUE),
    UnformattedAssessorParcelNumber
  ))
checkAPN(6087, dam09, attr09)

#################### 2012 incidents

dam12[, .N, by = FIPS]

# 6103
inspectAPN(6103, dam12, attr12)
dam12 <- dam12 %>%
  mutate(UnformattedAssessorParcelNumber = if_else(FIPS == 6103,
    gsub("-", "", UnformattedAssessorParcelNumber, fixed = TRUE),
    UnformattedAssessorParcelNumber
  )) %>%
  mutate(UnformattedAssessorParcelNumber = if_else(FIPS == 6103,
    substr(UnformattedAssessorParcelNumber, 1, 8),
    UnformattedAssessorParcelNumber
  ))
checkAPN(6103, dam12, attr12)

# 6089
inspectAPN(6089, dam12, attr12)
dam12 <- dam12 %>%
  mutate(UnformattedAssessorParcelNumber = if_else(FIPS == 6089,
    paste0(
      substr(UnformattedAssessorParcelNumber, 1, 5),
      substr(UnformattedAssessorParcelNumber, 7, 9)
    ),
    UnformattedAssessorParcelNumber
  ))
checkAPN(6089, dam12, attr12)

# 6073 - no changes needed
inspectAPN(6073, dam12, attr12)
checkAPN(6073, dam12, attr12)






## -- Confirm UAPN uniquely identifies properties on the ZTRAX side (this is a problem in some newer vintages of data, does not seem to be here)

damagefiles <- list(dam07, dam08, dam09, dam12)
attrfiles <- list(attr07, attr08, attr09, attr12)

for (p in seq_len(length(attrfiles))) {
  print(p)
  a <- merge(attrfiles[[p]], damagefiles[[p]], by = c("UnformattedAssessorParcelNumber", "FIPS"))
  print(anyDuplicated(a, by = c("UnformattedAssessorParcelNumber", "FIPS")) == 0)
}
rm(damagefiles, attrfiles)

#########################################################
############ Save the cleaned files #####################
#########################################################
#- set up 2 big NAMED lists of data frames to save

cleandamagelist <- list(dam07, dam08, dam09, dam12)

cleanattrlist <- list(attr07, attr08, attr09, attr12)

#--- Limit ZTRAX data to just counties in damage data
limitztrax <- function(damagefile, attrfile) {
  y <- attrfile[attrfile$FIPS %in% unique(damagefile$FIPS), ]
  return(y)
}
for (i in seq_len(length(cleanattrlist))) {
  cleanattrlist[[i]] <- limitztrax(cleandamagelist[[i]], cleanattrlist[[i]])
}
rm(i)
## confirm this gives the same result as simpler way:
# z<-limitztrax(damage_wa_20,attr_wa_20)
# stopifnot(nrow(z)==nrow(cleanattrlist[[10]]))
# stopifnot(z==cleanattrlist[[10]] | is.na(z))


write_fst(dam07, file.path(WORKING, "damage_ca_07_cleaned.fst"))
write_fst(dam08, file.path(WORKING, "damage_ca_08_cleaned.fst"))
write_fst(dam09, file.path(WORKING, "damage_ca_09_cleaned.fst"))
write_fst(dam12, file.path(WORKING, "damage_ca_12_cleaned.fst"))


write_fst(attr07, file.path(WORKING, "attr_ca_07_cleaned.fst"))
write_fst(attr08, file.path(WORKING, "attr_ca_08_cleaned.fst"))
write_fst(attr09, file.path(WORKING, "attr_ca_09_cleaned.fst"))
write_fst(attr12, file.path(WORKING, "attr_ca_12_cleaned.fst"))
